#!/bin/bash

# Shared settings for the KEGG download steps below.

# A pipeline reports failure when any of its stages fails, not only the last.
set -o pipefail
# NOTE(review): errexit is left disabled, as in the original; presumably so a
# single failed download does not abort the whole run -- confirm before enabling.
#set -o errexit

# Exported so every child process (sort, join, grep, ...) uses the C locale.
export LC_ALL=C

# Root of the data tree and the snapshot date that names the output directory.
# All three variables are exported, so child processes can read them too.
export BASE_DATA_PATH=/home/mixagol/data/
export BASE_DATE=20120802
export CUR_DIR="${BASE_DATA_PATH}/7_aux/kegg/${BASE_DATE}"

# Quoted so a path containing whitespace or glob characters stays one argument.
mkdir -p -- "${CUR_DIR}"

##
## ID MAPPING
##

# Build tab-separated ID mapping tables for every organism whose line in the
# KEGG organism list contains 'Prokaryotes'.
#
# Reads:   CUR_DIR (output directory; must be set and non-empty)
# Writes:  ${CUR_DIR}/orgs_all, orgs, kegg2ncbi.txt, kegg2uniprot.txt,
#          ncbi2kegg.txt, ncbi2uniprot.txt
# Scratch: ${CUR_DIR}/tmp_orgs (wiped before and after use);
#          sort spills its temporary files into the current directory (-T .)

wget 'http://rest.kegg.jp/list/organism' -O "${CUR_DIR}/orgs_all"

grep 'Prokaryotes' "${CUR_DIR}/orgs_all" > "${CUR_DIR}/orgs"

# With no organisms every later step would only overwrite the mapping tables
# with empty files, so stop here instead.
if [[ ! -s "${CUR_DIR}/orgs" ]]; then
    echo "error: no 'Prokaryotes' entries found in ${CUR_DIR}/orgs_all (download failed?)" >&2
    exit 1
fi

# ${CUR_DIR:?} aborts rather than running 'rm -rf /tmp_orgs/' if CUR_DIR is empty.
rm -rf -- "${CUR_DIR:?}/tmp_orgs/"
mkdir -p -- "${CUR_DIR}/tmp_orgs"

# Column 2 of the organism list is used as the organism code in the REST URLs.
# The list is read on fd 3 so nothing inside the loop can consume it via stdin.
while IFS= read -r org <&3; do
    [[ -n "${org}" ]] || continue
    #wget "http://rest.kegg.jp/list/${org}" -O "${CUR_DIR}/tmp_orgs/genes_${org}"
    wget "http://rest.kegg.jp/conv/ncbi-geneid/${org}" -O "${CUR_DIR}/tmp_orgs/kegg2ncbi_${org}" \
        || echo "warning: ncbi-geneid conversion download failed for ${org}" >&2
    wget "http://rest.kegg.jp/conv/uniprot/${org}" -O "${CUR_DIR}/tmp_orgs/kegg2uniprot_${org}" \
        || echo "warning: uniprot conversion download failed for ${org}" >&2
done 3< <(cut -f2 "${CUR_DIR}/orgs")

# Concatenate the per-organism files, drop empty lines, strip the database
# prefix from column 2 and sort on column 1 (the key 'join' uses below).
pv "${CUR_DIR}"/tmp_orgs/kegg2ncbi_* \
    | grep . \
    | sed 's/\tncbi-geneid:/\t/g' \
    | sort -t$'\t' -k1,1 -S1G -T . \
    > "${CUR_DIR}/kegg2ncbi.txt"
pv "${CUR_DIR}"/tmp_orgs/kegg2uniprot_* \
    | grep . \
    | sed 's/\tup:/\t/g' \
    | sort -t$'\t' -k1,1 -S1G -T . \
    > "${CUR_DIR}/kegg2uniprot.txt"

# Reverse mapping: swap the two columns and re-sort on the new first column.
# Fields are split on tabs only, matching the tab-delimited sort and join.
pv "${CUR_DIR}/kegg2ncbi.txt" \
    | awk -F'\t' -v OFS='\t' '{print $2, $1}' \
    | sort -t$'\t' -k1,1 -S1G -T . \
    > "${CUR_DIR}/ncbi2kegg.txt"

rm -rf -- "${CUR_DIR:?}/tmp_orgs/"

# Join the two tables on their shared first column and keep only the two
# value columns (NCBI id, UniProt id), sorted by the first of them.
join -t$'\t' \
    "${CUR_DIR}/kegg2ncbi.txt" \
    "${CUR_DIR}/kegg2uniprot.txt" \
    | cut -f2,3 \
    | sort -t$'\t' -k1,1 -S1G -T . \
    > "${CUR_DIR}/ncbi2uniprot.txt"



##
## KEGG FUNCTIONS
##


##
## PATHWAYS
##




##
## GENOMES
##

# Download the KEGG entry page of every organism listed in ${CUR_DIR}/orgs,
# pipe each page through analize_genome_html.py and pass the combined output
# to get_free_pmc_article.py.
#
# Reads:   CUR_DIR (must be set and non-empty), ${CUR_DIR}/orgs
# Writes:  ${CUR_DIR}/orgs_articles.txt, ${CUR_DIR}/orgs_articles_full.txt
# Scratch: ${CUR_DIR}/tmp_orgs (wiped before and after use)
# Both helper scripts are referenced relative to the current working
# directory, so run this from the directory that contains 7_aux/.

# ${CUR_DIR:?} aborts rather than running 'rm -rf /tmp_orgs/' if CUR_DIR is empty.
rm -rf -- "${CUR_DIR:?}/tmp_orgs/"
mkdir -p -- "${CUR_DIR}/tmp_orgs"

# Column 2 of the organism list is used as the entry name in the URL.
# The list is read on fd 3 so nothing inside the loop can consume it via stdin.
while IFS= read -r org <&3; do
    [[ -n "${org}" ]] || continue
    wget "http://www.kegg.jp/entry/${org}" -O "${CUR_DIR}/tmp_orgs/genome_info_${org}" \
        || echo "warning: entry page download failed for ${org}" >&2
done 3< <(cut -f2 "${CUR_DIR}/orgs")

# Each page is fed to the parser on stdin; the parser's stdout for all pages
# is collected into one file.
for f in "${CUR_DIR}"/tmp_orgs/genome_info_*; do
    # An unmatched glob stays literal; skip it instead of reading a missing file.
    [[ -e "${f}" ]] || continue
    7_aux/kegg/analize_genome_html.py < "${f}"
done > "${CUR_DIR}/orgs_articles.txt"

# NOTE(review): nothing in this file uses the fresh scratch directory before it
# is removed again; presumably get_free_pmc_article.py works in it via the
# exported CUR_DIR -- confirm before dropping these two lines.
rm -rf -- "${CUR_DIR:?}/tmp_orgs/"
mkdir -p -- "${CUR_DIR}/tmp_orgs"

7_aux/kegg/get_free_pmc_article.py < "${CUR_DIR}/orgs_articles.txt" > "${CUR_DIR}/orgs_articles_full.txt"

rm -rf -- "${CUR_DIR:?}/tmp_orgs/"


